Set-up

Built with R 4.0.3
Last saved on 05 February 2022 at 15:41

knitr::opts_chunk$set(echo = TRUE, tidy = TRUE, paged.print=TRUE, fig.width = 10, warning=FALSE, cache = TRUE)

library(car) # For data wrangling
library(caret) # For its confusion matrix function
#library(clipr) # For quick exports to other programme
library(cowplot)
library(DescTools) # For 95% CI
library(dplyr)
library(emmeans)
#library(FactoMineR) # For Shiny app
library(factoextra) # For circular graphs of variables
library(forcats) # For the fct_relevel function
library(here) # For dynamic file paths
library(ggplot2) 
library(ggthemes) # For theme of factoextra plots
library(lme4) # For linear regression modelling
library(patchwork) # To create figures with more than one plot
#library(pca3d) # For 3-D plots (not rendered in html knit)
library(PCAtools) # For nice biplots of PCA results
library(purrr) # For data wrangling
library(psych) # For various useful stats function
library(sjPlot) # For model plots and tables
library(suffrager) # For pretty feminist colour palettes :)
library(visreg) # For plots of interaction effects

source(here("R_rainclouds.R")) # For geom_flat_violin rainplots

PCA and 3-D plots

Import full data (see 7_Ref_data_prep.Rmd for data preparation steps)

data <- readRDS(here("FullMDA", "dataforPCA.rds"))

This chunk can be used to perform the MDA on various subsets of the data.

# Full dataset
data <- readRDS(here("FullMDA", "dataforPCA.rds")) 

# Subset of the data that excludes levels A and B textbooks
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
  filter(!Level %in% c("A", "B")) %>%
  droplevels()
summary(data$Level)

# Subset of the data to only include one Country subcorpus of the TEC
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
  #filter(Country != "France" & Country != "Germany") %>% # Spain only
  #filter(Country != "France" & Country != "Spain") %>% # Germany only
  filter(Country != "Spain" & Country != "Germany") %>% # France only
  droplevels()
summary(data$Country)

# Perform PCA on a random subset of the data to test the stability of the
# solution. Re-running this line draws a fresh random sample of 60% of the
# texts. FIX: previously hard-coded as n = 4980 * 0.6 (4980 = total number of
# texts) while the comment claimed "2/3"; `prop = 0.6` computes the same
# sample size but scales automatically if the dataset changes.
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
  slice_sample(prop = 0.6, replace = FALSE)
nrow(data)

This chunk is used to create the 3-D plots. These cannot be rendered in the html knit.

# Inspect the column layout: metadata occupies columns 1-8, the quantitative
# feature variables start at column 9.
colnames(data)
# PCA on all quantitative variables. scale. = FALSE because the features
# appear to have been standardised upstream (see the data preparation script).
pca <- prcomp(data[, 9:ncol(data)], scale. = FALSE)

# Grouping factors used to colour/shape the 3-D plots below
register <- factor(data[["Register"]])
corpus <- factor(data[["Corpus"]])
subcorpus <- factor(data[["Subcorpus"]])

summary(register)
summary(corpus)
summary(subcorpus)
summary(pca)

# 3-D plot

# Build a nine-colour range by combining two suffrager palettes, then pick six
# colours in an order that suits the PCA group plots. NOTE: col6 is overwritten
# with hard-coded hex values in the next chunk; this block mainly serves to
# preview the palette with show_col().
colours <- suf_palette(name = "london", n = 6, type = "continuous") 
colours2 <- suf_palette(name = "classic", n = 5, type = "continuous") 
colours <- c(colours, colours2[c(2:4)]) # Nine colours range
col6 <- colours[c(6,5,4,7,9,2)] # Good order for PCA
scales::show_col(col6)

# Final colour mapping for the six subcorpora (overrides the palette-derived
# col6 above): the three textbook registers first, then the three reference
# corpora.
col6 <- c(`Textbook Conversation` = "#F9B921",
          `Textbook Fiction` = "#A18A33",
          `Textbook Informative` = "#722672",
          `Spoken BNC2014 Ref.` = "#BD241E",
          `Youth Fiction Ref.` = "#267226",
          `Info Teens Ref.` = "#15274D")
# Textbook subcorpora plot as cubes, reference corpora as spheres. Reusing
# names(col6) removes the duplicated name vector of the original and keeps the
# two keys from drifting apart.
shapes6 <- setNames(rep(c("cube", "sphere"), each = 3), names(col6))

# 3-D scatterplot of the texts on the first three PCs, grouped by subcorpus.
# NOTE(review): pca3d() belongs to the pca3d package, whose library() call is
# commented out in the setup chunk (the 3-D plots cannot be rendered in the
# html knit) — attach pca3d manually before running this chunk.
pca3d(pca, group = subcorpus, 
      components = 1:3,
      #components = 4:6,
      show.plane=FALSE,
      col = col6,
      shape = shapes6,
      radius = 0.7,
      legend = "right")

## Looking at all three Textbook English registers in one colour

# Colour mapping for the four corpora. (The palette-derived assignment
# `col4 <- colours[c(1,3,7,9)]` that preceded this was dead code — it was
# immediately overwritten by these hard-coded values — and has been removed.)
col4 <- c("#EA7E1E", "#15274D", "#BD241E", "#267226")
names(col4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")

# TEC texts as cubes, the three reference corpora as spheres
shapes4 <- c("cube", rep("sphere", 3))
names(shapes4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")

# 3-D plot of PC1-PC3 grouped by corpus (requires the pca3d package, whose
# library() call is commented out in the setup chunk — load it manually)
pca3d(pca, group = corpus, 
      show.plane=FALSE,
      components = 1:3,
      col = col4,
      shape = shapes4,
      radius = 0.7,
      legend = "right")

PCA biplots

# Prepare the biplot data: add a Source factor collapsing the three reference
# corpora into a single level (vs. the TEC), and replace Corpus with the
# six-level Subcorpus factor, moving "Info Teens Ref." to the end of the level
# order (after = 9 exceeds the number of levels, so the level appears to be
# appended last — TODO confirm against the Corpus summary below).
data2 <- data %>% 
  mutate(Source = recode_factor(Corpus, Textbook.English = "Textbook English (TEC)", Informative.Teens = "Reference corpora", Spoken.BNC2014 = "Reference corpora", Youth.Fiction = "Reference corpora")) %>% 
  # NOTE(review): this deliberately overwrites Corpus with the relevelled
  # Subcorpus (the six-level summaries below rely on it)
  mutate(Corpus = fct_relevel(Subcorpus, "Info Teens Ref.", after = 9)) %>%
  relocate(Source, .after = "Corpus") %>% 
  droplevels(.)

colnames(data2)
##  [1] "Filename"  "Register"  "Level"     "Series"    "Country"   "Corpus"   
##  [7] "Source"    "Subcorpus" "Words"     "ACT"       "AMP"       "ASPECT"   
## [13] "AWL"       "BEMA"      "CAUSE"     "CC"        "COMM"      "CONC"     
## [19] "COND"      "CONT"      "CUZ"       "DEMO"      "DMA"       "DOAUX"    
## [25] "DT"        "DWNT"      "ELAB"      "EMPH"      "EX"        "EXIST"    
## [31] "FPP1P"     "FPP1S"     "FPUH"      "GTO"       "HDG"       "HGOT"     
## [37] "IN"        "JJAT"      "JJPR"      "LD"        "MDCA"      "MDCO"     
## [43] "MDMM"      "MDNE"      "MDWO"      "MDWS"      "MENTAL"    "NCOMP"    
## [49] "NN"        "OCCUR"     "PASS"      "PEAS"      "PIT"       "PLACE"    
## [55] "POLITE"    "POS"       "PROG"      "QUAN"      "QUPR"      "QUTAG"    
## [61] "RB"        "RP"        "SPLIT"     "SPP2"      "STPR"      "THATD"    
## [67] "THRC"      "THSC"      "TTR"       "VBD"       "VBG"       "VBN"      
## [73] "VIMP"      "WHQU"      "WHSC"      "XX0"       "YNQU"      "TPP3"     
## [79] "FQTI"
# Metadata columns (Filename ... Words) keyed by text: move Filename into the
# rownames, then drop it as a column.
data2meta <- data2[, 1:9]
rownames(data2meta) <- data2meta$Filename
data2meta$Filename <- NULL
head(data2meta)
##                                            Register Level    Series Country
## Achievers_B1_plus_Informative_0007.txt  Informative     D Achievers   Spain
## POC_5e_Spoken_0003.txt                 Conversation     B       POC  France
## Access_4_Narrative_0013.txt                 Fiction     D    Access Germany
## NGL_1_Spoken_0002.txt                  Conversation     A       NGL Germany
## Access_1_Narrative_0005.txt                 Fiction     A    Access Germany
## NGL_2_Narrative_0007.txt                    Fiction     B       NGL Germany
##                                                       Corpus
## Achievers_B1_plus_Informative_0007.txt  Textbook Informative
## POC_5e_Spoken_0003.txt                 Textbook Conversation
## Access_4_Narrative_0013.txt                 Textbook Fiction
## NGL_1_Spoken_0002.txt                  Textbook Conversation
## Access_1_Narrative_0005.txt                 Textbook Fiction
## NGL_2_Narrative_0007.txt                    Textbook Fiction
##                                                        Source
## Achievers_B1_plus_Informative_0007.txt Textbook English (TEC)
## POC_5e_Spoken_0003.txt                 Textbook English (TEC)
## Access_4_Narrative_0013.txt            Textbook English (TEC)
## NGL_1_Spoken_0002.txt                  Textbook English (TEC)
## Access_1_Narrative_0005.txt            Textbook English (TEC)
## NGL_2_Narrative_0007.txt               Textbook English (TEC)
##                                                    Subcorpus Words
## Achievers_B1_plus_Informative_0007.txt  Textbook Informative   690
## POC_5e_Spoken_0003.txt                 Textbook Conversation   694
## Access_4_Narrative_0013.txt                 Textbook Fiction   547
## NGL_1_Spoken_0002.txt                  Textbook Conversation   927
## Access_1_Narrative_0005.txt                 Textbook Fiction   840
## NGL_2_Narrative_0007.txt                    Textbook Fiction  1127
# PCAtools expects variables in rows and samples in columns, so transpose the
# quantitative feature columns (10 onwards); the text filenames become the
# column names via the rownames set here.
rownames(data2) <- data2$Filename
data2num <- as.data.frame(base::t(data2[,10:ncol(data2)]))
data2num[1:5,1:5] # Check data frame format is correct
##        Achievers_B1_plus_Informative_0007.txt POC_5e_Spoken_0003.txt
## ACT                                 1.3305834             -0.8133966
## AMP                                -0.4504541              0.1376718
## ASPECT                              1.0228557             -0.4515438
## AWL                                 0.7288132             -0.7268089
## BEMA                               -0.4801820              1.1272237
##        Access_4_Narrative_0013.txt NGL_1_Spoken_0002.txt
## ACT                     -0.4707243             -1.014328
## AMP                      1.4534008             -0.561931
## ASPECT                   1.1039502             -0.749580
## AWL                     -0.5477092             -0.716272
## BEMA                     0.2456594              1.700641
##        Access_1_Narrative_0005.txt
## ACT                     0.03320131
## AMP                    -0.52977306
## ASPECT                 -0.55344739
## AWL                    -0.56332789
## BEMA                   -0.39233698
# PCA via PCAtools: data2num holds variables as rows and texts as columns,
# with per-text metadata attached for plotting. scale = FALSE — the features
# appear to have been standardised upstream (see 7_Ref_data_prep.Rmd).
p <- PCAtools::pca(data2num, 
         metadata = data2meta,
         scale = FALSE)

# Percentage of variance explained by the first six PCs, and their total
p$variance[1:6]
##       PC1       PC2       PC3       PC4       PC5       PC6 
## 30.334148  7.512979  5.886828  3.415906  2.703581  2.415911
sum(p$variance[1:6])
## [1] 52.26935
# For five TEC registers
# colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672", `Textbook Instructional` = "grey", `Textbook Personal` = "black")

# For three TEC registers
summary(data2$Corpus)
## Textbook Conversation      Textbook Fiction  Textbook Informative 
##                   565                   285                   352 
##   Spoken BNC2014 Ref.    Youth Fiction Ref.       Info Teens Ref. 
##                  1250                  1191                  1337
colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672")

summary(data2$Source)
## Textbook English (TEC)      Reference corpora 
##                   1202                   3778
shapekey = c(`Textbook English (TEC)`=6, `Reference corpora`=1)

## Very slow, open in zoomed out window!
# Add legend manually? Yes (take it from the biplot code below), let's not waste too much time, here. Or use Evert's mvar.pairs plot (though that also requires manual axis annotation!)
# Matrix of pairwise scatterplots of the PC1-PC6 scores: points coloured by
# subcorpus (colkey) and shaped by Source (TEC vs. reference corpora).
PCAtools::pairsplot(p,
                 triangle = FALSE,
                 components = 1:6,
                 pointSize = 0.4,
                 shape = "Source",
                 shapekey = shapekey,
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Corpus",
                 colkey = colkey)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

# Biplots to examine components more carefully

summary(data2$Level)
##    A    B    C    D    E Ref. 
##  152  282  306  304  158 3778
shapekey = c(A=1, B=2, C=6, D=0, E=5, `Ref.`=4)

# PC1 vs PC2 biplot of all texts: coloured by subcorpus (colkey), shaped by
# textbook proficiency level (shapekey; "Ref." = reference-corpus texts), with
# an ellipse per colour group. Axis limits span the full range of PC scores.
PCAtools::biplot(p,
                 x = "PC1",
                 y = "PC2",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Corpus",
                 pointSize = 1,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 xlim = c(min(p$rotated[, "PC1"]), max(p$rotated[, "PC1"])),
                 ylim = c(min(p$rotated[, "PC2"]), max(p$rotated[, "PC2"])),
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 18,
                 legendPosition = 'right',
                 legendTitleSize = 18,
                 legendLabSize = 14, 
                 legendIconSize = 5) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

#ggsave(here("Plots", "PCA_Ref3TxB_BiplotPC1_PC2.svg"), width = 12, height = 8)

# Biplots to examine components more carefully
# PC3 vs PC4, same aesthetics as the PC1/PC2 biplot above.
PCAtools::biplot(p,
                 x = "PC3",
                 y = "PC4",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Corpus",
                 pointSize = 1,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 xlim = c(min(p$rotated[, "PC3"]), max(p$rotated[, "PC3"])),
                 ylim = c(min(p$rotated[, "PC4"]), max(p$rotated[, "PC4"])),
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 18,
                 legendPosition = 'right',
                 legendTitleSize = 18,
                 legendLabSize = 14, 
                 legendIconSize = 5) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

# NOTE(review): unlike the other exports in this section, this ggsave() call
# is not commented out — confirm that re-writing the SVG on every knit is
# intended.
ggsave(here("Plots", "PCA_Ref3TxB_BiplotPC3_PC4.svg"), width = 12, height = 8)

# Biplots to examine components more carefully
# PC5 vs PC6, same aesthetics as the PC1/PC2 biplot above.
PCAtools::biplot(p,
                 x = "PC5",
                 y = "PC6",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Corpus",
                 pointSize = 1,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 xlim = c(min(p$rotated[, "PC5"]), max(p$rotated[, "PC5"])),
                 ylim = c(min(p$rotated[, "PC6"]), max(p$rotated[, "PC6"])),
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 18,
                 legendPosition = 'right',
                 legendTitleSize = 18,
                 legendLabSize = 14, 
                 legendIconSize = 5) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

#ggsave(here("Plots", "PCA_Ref3TxB_BiplotPC5_PC6.svg"), width = 12, height = 8)
# Biplot with ellipses for Level rather than Register
# Colour key for proficiency levels, shape key for subcorpora.
# BUG FIX(review): the grey entry was named `Ref. data`, but the Level factor's
# level is "Ref." (see the summary above), so reference texts were not matched
# by the manual colour scale. Also: `<-` for top-level assignment, not `=`.
colkey <- c(A = "#F9B921", B = "#A18A33", C = "#BD241E", D = "#722672",
            E = "#15274D", `Ref.` = "darkgrey")
shapekey <- c(`Spoken BNC2014 Ref.` = 16, `Info Teens Ref.` = 17,
              `Youth Fiction Ref.` = 15, `Textbook Fiction` = 0,
              `Textbook Conversation` = 1, `Textbook Informative` = 2)

# PC3 vs PC4 with the colour/shape mapping flipped relative to the biplots
# above: colour = proficiency level, shape = subcorpus, so the ellipses now
# group the levels.
PCAtools::biplot(p,
                 x = "PC3",
                 y = "PC4",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Level",
                 pointSize = 1,
                 colkey = colkey,
                 shape = "Corpus",
                 shapekey = shapekey,
                 xlim = c(min(p$rotated[, "PC3"]), max(p$rotated[, "PC3"])),
                 ylim = c(min(p$rotated[, "PC4"]), max(p$rotated[, "PC4"])),
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 18,
                 legendPosition = 'right',
                 legendTitleSize = 18,
                 legendLabSize = 14, 
                 legendIconSize = 5) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

#ggsave(here("Plots", "PCA_Ref3TxB_BiplotPC3_PC4_levels.svg"), width = 12, height = 8)

# Biplots to examine components more carefully
# PC5 vs PC6, coloured by proficiency level and shaped by subcorpus.
PCAtools::biplot(p,
                 x = "PC5",
                 y = "PC6",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Level",
                 pointSize = 1,
                 colkey = colkey,
                 shape = "Corpus",
                 shapekey = shapekey,
                 xlim = c(min(p$rotated[, "PC5"]), max(p$rotated[, "PC5"])),
                 ylim = c(min(p$rotated[, "PC6"]), max(p$rotated[, "PC6"])),
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 18,
                 legendPosition = 'right',
                 legendTitleSize = 18,
                 legendLabSize = 14, 
                 legendIconSize = 5) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

#ggsave(here("Plots", "PCA_Ref3TxB_BiplotPC5_PC6_levels.svg"), width = 12, height = 8)

Feature contributions (loadings) on each component

# data <- readRDS(here('FullMDA', 'dataforPCA.rds'))
colnames(data)
##  [1] "Filename"  "Register"  "Level"     "Series"    "Country"   "Corpus"   
##  [7] "Subcorpus" "Words"     "ACT"       "AMP"       "ASPECT"    "AWL"      
## [13] "BEMA"      "CAUSE"     "CC"        "COMM"      "CONC"      "COND"     
## [19] "CONT"      "CUZ"       "DEMO"      "DMA"       "DOAUX"     "DT"       
## [25] "DWNT"      "ELAB"      "EMPH"      "EX"        "EXIST"     "FPP1P"    
## [31] "FPP1S"     "FPUH"      "GTO"       "HDG"       "HGOT"      "IN"       
## [37] "JJAT"      "JJPR"      "LD"        "MDCA"      "MDCO"      "MDMM"     
## [43] "MDNE"      "MDWO"      "MDWS"      "MENTAL"    "NCOMP"     "NN"       
## [49] "OCCUR"     "PASS"      "PEAS"      "PIT"       "PLACE"     "POLITE"   
## [55] "POS"       "PROG"      "QUAN"      "QUPR"      "QUTAG"     "RB"       
## [61] "RP"        "SPLIT"     "SPP2"      "STPR"      "THATD"     "THRC"     
## [67] "THSC"      "TTR"       "VBD"       "VBG"       "VBN"       "VIMP"     
## [73] "WHQU"      "WHSC"      "XX0"       "YNQU"      "TPP3"      "FQTI"
pca <- prcomp(data[, 9:ncol(data)], scale. = FALSE)  # All quantitative variables

# The rotated data that represents the observations / samples is stored in
# rotated, while the variable loadings are stored in loadings

# Function that applies a cut-off point for 'relevant' loadings (not used in
# thesis)
# Extract the feature loadings (eigenvector coefficients) for PC1-PC6
loadings <- as.data.frame(pca$rotation[, 1:6])
# Zero out negligible loadings: values with |x| < threshold are replaced by 0;
# NA values are left unchanged. Generalised: the 0.05 cut-off was hard-coded
# (it also appears in the filter below) and is now a defaulted parameter.
# Rewritten in base R so the helper no longer depends on dplyr::if_else().
smallToZero <- function(x, threshold = 0.05) {
    x[!is.na(x) & abs(x) < threshold] <- 0
    x
}
# Keep only features with at least one loading beyond the |0.05| cut-off, zero
# out the negligible entries, and round for display.
# BUG FIX: the original filter used `any_vars(. > abs(0.05))`, i.e. `. > 0.05`,
# which silently dropped features whose only sizeable loadings are negative
# (e.g. CONC, whose strongest loading is -0.18, is missing from the original
# table). The test must be on the absolute value. Superseded
# filter_all()/mutate_all() replaced with if_any()/across().
loadings %>%
    filter(if_any(everything(), ~ abs(.x) > 0.05)) %>%
    mutate(across(everything(), smallToZero)) %>%
    round(3)
##           PC1    PC2    PC3    PC4    PC5    PC6
## ACT    -0.102  0.000  0.000  0.121 -0.123  0.320
## AMP     0.000 -0.054 -0.052  0.094  0.350 -0.185
## ASPECT -0.081  0.104  0.000  0.000 -0.061  0.091
## BEMA    0.075 -0.241  0.063  0.000  0.268 -0.157
## CAUSE  -0.084 -0.117  0.056  0.145 -0.102  0.000
## CC     -0.138 -0.126 -0.092 -0.093  0.067  0.000
## COMM    0.000  0.193  0.000  0.196 -0.118 -0.232
## COND    0.083  0.000 -0.167  0.234 -0.122  0.062
## CONT    0.223 -0.050  0.000  0.000  0.000  0.000
## CUZ     0.104 -0.145 -0.200 -0.176 -0.088  0.000
## DEMO    0.146 -0.119 -0.077  0.000  0.000  0.000
## DMA     0.202 -0.088  0.000 -0.164  0.000  0.000
## DOAUX   0.185  0.000  0.057  0.000 -0.080 -0.080
## DT      0.078  0.159 -0.240  0.000  0.246  0.213
## DWNT    0.000  0.104 -0.120  0.093  0.133  0.000
## ELAB   -0.073 -0.165  0.000  0.068  0.086  0.000
## EMPH    0.168 -0.065 -0.094  0.000  0.000 -0.099
## EX      0.055  0.000  0.000  0.000  0.357  0.191
## FPP1P   0.077  0.000  0.090  0.188  0.000  0.064
## FPP1S   0.174  0.057  0.062  0.082  0.000 -0.103
## FPUH    0.180 -0.097  0.000 -0.189  0.000  0.000
## GTO     0.142  0.000  0.000  0.000 -0.143  0.000
## HDG     0.100 -0.069 -0.138 -0.121  0.000  0.000
## HGOT    0.160  0.000  0.000 -0.108  0.000  0.098
## JJAT   -0.050 -0.126 -0.250  0.066  0.180  0.000
## JJPR    0.000 -0.172  0.000  0.213  0.240 -0.156
## LD     -0.174 -0.163  0.127  0.000 -0.173  0.000
## MDCA    0.000 -0.210  0.108  0.225  0.000  0.107
## MDCO    0.000  0.187 -0.149  0.097  0.000  0.073
## MDMM    0.000 -0.098 -0.142  0.173 -0.053  0.000
## MDNE    0.063  0.000 -0.059  0.223 -0.069  0.000
## MDWO    0.071  0.108 -0.178  0.000  0.000  0.000
## MDWS    0.056  0.000  0.000  0.246 -0.157  0.145
## MENTAL  0.109  0.000 -0.055  0.157 -0.072 -0.305
## NCOMP   0.000 -0.265  0.000  0.000 -0.205  0.213
## NN     -0.213 -0.104  0.098 -0.070  0.000  0.000
## PEAS   -0.058  0.119 -0.186  0.124  0.000 -0.081
## PIT     0.152 -0.102 -0.151 -0.069  0.058  0.134
## PLACE   0.000  0.090  0.092  0.072  0.199  0.309
## POLITE  0.086  0.000  0.196  0.105  0.000 -0.063
## POS     0.000  0.094  0.000 -0.054 -0.278 -0.135
## PROG    0.090  0.078  0.000  0.152 -0.134  0.063
## QUAN    0.171  0.000 -0.157  0.000  0.104  0.000
## QUPR    0.080  0.109 -0.124  0.205  0.000 -0.112
## QUTAG   0.147  0.000 -0.070 -0.149 -0.064  0.078
## RB      0.144  0.150 -0.182  0.073  0.053  0.000
## RP      0.000  0.216 -0.089  0.150 -0.068  0.357
## SPLIT   0.000 -0.107 -0.213  0.085 -0.058 -0.071
## SPP2    0.167 -0.066  0.098  0.163  0.000  0.000
## STPR    0.097  0.000  0.000  0.000  0.000  0.097
## THATD   0.159  0.000 -0.140  0.000 -0.162 -0.088
## THRC    0.000 -0.170 -0.151  0.000 -0.063  0.125
## THSC    0.000 -0.076 -0.269  0.065  0.000 -0.159
## TTR    -0.186  0.072  0.000  0.157  0.000  0.000
## VBD    -0.096  0.349 -0.052 -0.196  0.000  0.000
## VBG    -0.144  0.000 -0.135  0.123 -0.138  0.086
## VIMP    0.000 -0.071  0.214  0.205  0.000  0.089
## WHQU    0.134  0.000  0.202  0.065  0.000 -0.052
## WHSC   -0.091 -0.103 -0.199  0.054 -0.050 -0.090
## XX0     0.181  0.000 -0.058  0.056 -0.058 -0.094
## YNQU    0.177  0.000  0.140  0.000  0.000  0.000
## TPP3   -0.054  0.303  0.000 -0.151  0.000 -0.121
## FQTI   -0.069  0.000  0.000  0.139  0.170 -0.105
# Tabular overview of loadings with no threshold
loadings %>% round(2)
##          PC1   PC2   PC3   PC4   PC5   PC6
## ACT    -0.10  0.01 -0.01  0.12 -0.12  0.32
## AMP     0.00 -0.05 -0.05  0.09  0.35 -0.19
## ASPECT -0.08  0.10 -0.01  0.00 -0.06  0.09
## AWL    -0.21 -0.13 -0.06 -0.01 -0.06 -0.05
## BEMA    0.08 -0.24  0.06 -0.02  0.27 -0.16
## CAUSE  -0.08 -0.12  0.06  0.14 -0.10  0.04
## CC     -0.14 -0.13 -0.09 -0.09  0.07  0.03
## COMM   -0.03  0.19  0.03  0.20 -0.12 -0.23
## CONC   -0.03 -0.03 -0.18 -0.06 -0.06 -0.10
## COND    0.08 -0.02 -0.17  0.23 -0.12  0.06
## CONT    0.22 -0.05  0.03  0.00 -0.04  0.00
## CUZ     0.10 -0.14 -0.20 -0.18 -0.09 -0.01
## DEMO    0.15 -0.12 -0.08 -0.04  0.01  0.03
## DMA     0.20 -0.09 -0.04 -0.16 -0.03  0.04
## DOAUX   0.18 -0.02  0.06  0.00 -0.08 -0.08
## DT      0.08  0.16 -0.24 -0.03  0.25  0.21
## DWNT   -0.04  0.10 -0.12  0.09  0.13 -0.02
## ELAB   -0.07 -0.17 -0.04  0.07  0.09  0.00
## EMPH    0.17 -0.07 -0.09 -0.03 -0.04 -0.10
## EX      0.06 -0.02 -0.04  0.00  0.36  0.19
## EXIST  -0.13 -0.09 -0.05  0.00  0.01 -0.02
## FPP1P   0.08 -0.02  0.09  0.19  0.05  0.06
## FPP1S   0.17  0.06  0.06  0.08 -0.04 -0.10
## FPUH    0.18 -0.10 -0.05 -0.19 -0.04  0.05
## GTO     0.14  0.00 -0.04  0.00 -0.14  0.03
## HDG     0.10 -0.07 -0.14 -0.12  0.04  0.03
## HGOT    0.16 -0.05 -0.01 -0.11 -0.04  0.10
## IN     -0.21 -0.01 -0.08  0.01  0.03  0.03
## JJAT   -0.05 -0.13 -0.25  0.07  0.18 -0.02
## JJPR   -0.03 -0.17 -0.03  0.21  0.24 -0.16
## LD     -0.17 -0.16  0.13 -0.04 -0.17 -0.02
## MDCA    0.05 -0.21  0.11  0.22  0.00  0.11
## MDCO    0.00  0.19 -0.15  0.10  0.03  0.07
## MDMM   -0.02 -0.10 -0.14  0.17 -0.05  0.00
## MDNE    0.06 -0.02 -0.06  0.22 -0.07  0.01
## MDWO    0.07  0.11 -0.18  0.04 -0.05 -0.04
## MDWS    0.06 -0.02 -0.01  0.25 -0.16  0.14
## MENTAL  0.11 -0.02 -0.05  0.16 -0.07 -0.30
## NCOMP   0.00 -0.27 -0.05  0.03 -0.21  0.21
## NN     -0.21 -0.10  0.10 -0.07 -0.04  0.01
## OCCUR  -0.13 -0.02 -0.05 -0.06  0.02 -0.01
## PASS   -0.14 -0.13 -0.09 -0.10 -0.04 -0.02
## PEAS   -0.06  0.12 -0.19  0.12 -0.04 -0.08
## PIT     0.15 -0.10 -0.15 -0.07  0.06  0.13
## PLACE   0.02  0.09  0.09  0.07  0.20  0.31
## POLITE  0.09  0.00  0.20  0.11  0.03 -0.06
## POS     0.02  0.09  0.04 -0.05 -0.28 -0.13
## PROG    0.09  0.08 -0.04  0.15 -0.13  0.06
## QUAN    0.17 -0.01 -0.16  0.01  0.10  0.02
## QUPR    0.08  0.11 -0.12  0.21  0.03 -0.11
## QUTAG   0.15 -0.04 -0.07 -0.15 -0.06  0.08
## RB      0.14  0.15 -0.18  0.07  0.05  0.04
## RP     -0.01  0.22 -0.09  0.15 -0.07  0.36
## SPLIT  -0.03 -0.11 -0.21  0.08 -0.06 -0.07
## SPP2    0.17 -0.07  0.10  0.16 -0.03  0.02
## STPR    0.10  0.01  0.01 -0.04  0.01  0.10
## THATD   0.16 -0.01 -0.14 -0.02 -0.16 -0.09
## THRC   -0.05 -0.17 -0.15 -0.02 -0.06  0.13
## THSC   -0.02 -0.08 -0.27  0.07 -0.05 -0.16
## TTR    -0.19  0.07 -0.02  0.16 -0.04 -0.03
## VBD    -0.10  0.35 -0.05 -0.20  0.04 -0.04
## VBG    -0.14 -0.02 -0.14  0.12 -0.14  0.09
## VBN    -0.14 -0.10 -0.08 -0.07 -0.07  0.04
## VIMP    0.01 -0.07  0.21  0.21  0.05  0.09
## WHQU    0.13 -0.02  0.20  0.07  0.00 -0.05
## WHSC   -0.09 -0.10 -0.20  0.05 -0.05 -0.09
## XX0     0.18  0.01 -0.06  0.06 -0.06 -0.09
## YNQU    0.18 -0.03  0.14 -0.02 -0.03 -0.03
## TPP3   -0.05  0.30 -0.04 -0.15 -0.05 -0.12
## FQTI   -0.07  0.03  0.01  0.14  0.17 -0.11
# write_last_clip()

Graphs of variables

# Helper: circular correlation plot of the top-contributing variables for a
# pair of PCs, coloured by contribution. Extracted because the identical
# fviz_pca_var() call was repeated four times with only the axes and the
# number of displayed variables changing.
plot_pca_vars <- function(axes, n_vars = 30) {
  factoextra::fviz_pca_var(pca,
               axes = axes,
               select.var = list(contrib = n_vars),
               col.var = "contrib", # Colour by contributions to the PC
               gradient.cols = c("#F9B921", "#DB241E", "#722672"),
               title = "",
               repel = TRUE, # Try to avoid too much text overlapping
               ggtheme = ggthemes::theme_few())
}

# PC1 vs PC2
plot_pca_vars(c(1, 2))

#ggsave(here("Plots", "fviz_pca_var_PC1_PC2_Ref3Reg.svg"), width = 11, height = 9)

# PC3 vs PC2
plot_pca_vars(c(3, 2))

#ggsave(here("Plots", "fviz_pca_var_PC3_PC2_Ref3Reg.svg"), width = 9, height = 8)

# PC3 vs PC4
plot_pca_vars(c(3, 4))

#ggsave(here("Plots", "fviz_pca_var_PC3_PC4_Ref3Reg.svg"), width = 9, height = 8)

# PC5 vs PC6 (top 25 contributors only, as in the original)
plot_pca_vars(c(5, 6), n_vars = 25)

#ggsave(here("Plots", "fviz_pca_var_PC5_PC6_Ref3Reg.svg"), width = 9, height = 8)

Exploring feature contributions

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Per-text feature counts for the three-register dataset (prepared upstream);
# includes text metadata and a Words column — TODO confirm against the data
# preparation script (7_Ref_data_prep.Rmd)
ncounts <- readRDS(here("FullMDA", "counts3Reg.rds"))

# Median and median absolute deviation (MAD) of text length per Register x Level
ncounts %>% group_by(Register, Level) %>% summarise(median(Words), mad(Words))
## `summarise()` has grouped output by 'Register'. You can override using the `.groups` argument.
## # A tibble: 18 × 4
## # Groups:   Register [3]
##    Register     Level `median(Words)` `mad(Words)`
##    <fct>        <fct>           <dbl>        <dbl>
##  1 Conversation A                826          156.
##  2 Conversation B                828          219.
##  3 Conversation C                796          169.
##  4 Conversation D                794          146.
##  5 Conversation E                818.         161.
##  6 Conversation Ref.            8115         6028.
##  7 Fiction      A                902.         284.
##  8 Fiction      B                885          258.
##  9 Fiction      C                811          218.
## 10 Fiction      D                784.         218.
## 11 Fiction      E                841          218.
## 12 Fiction      Ref.            5950          181.
## 13 Informative  A                833          224.
## 14 Informative  B                864.         170.
## 15 Informative  C                857          162.
## 16 Informative  D                839          190.
## 17 Informative  E                862.         174.
## 18 Informative  Ref.             767          190.
# Median values of the VBD and PEAS features in the Informative texts, by level,
# rounded to two decimal places.
# across() replaces the superseded summarise_if()/mutate_if() verbs; a formula
# lambda is used instead of passing extra arguments through across()'s `...`
# (deprecated in dplyr 1.1). Column names and order are unchanged.
ncounts %>%
  filter(Register == "Informative") %>%
  # filter(Level %in% c('C', 'D', 'E')) %>%
  select(Level, VBD, PEAS) %>%
  group_by(Level) %>%
  summarise(across(where(is.numeric), median)) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))
## # A tibble: 6 × 3
##   Level   VBD  PEAS
##   <fct> <dbl> <dbl>
## 1 A      22.7  0   
## 2 B      32.3  1.69
## 3 C      31.6  4.26
## 4 D      39.2  7   
## 5 E      32.4  6.54
## 6 Ref.   30    4.08
# Six colours, presumably one per Level factor level (A-E + Ref. in darkgrey);
# mapping follows the factor level order — TODO confirm
cols = c("#F9B921", "#A18A33", "#BD241E", "#722672", "#15274D", "darkgrey")

# Template boxplot-plus-jitter for the Informative texts; plots CONT by
# default, and the y aesthetic is swapped out below for each other feature
boxfeature <- ncounts %>% filter(Register == "Informative") %>% # filter(Level %in% c('C', 'D', 'E')) %>%
select(Level, FPP1S, SPP2, CONT, EXIST, AWL, XX0, PASS, VBN) %>% ggplot(aes(x = Level, 
    y = CONT, colour = Level, fill = Level)) + geom_jitter(size = 0.7, alpha = 0.7) + 
    geom_boxplot(outlier.shape = NA, fatten = 2, fill = "white", alpha = 0.3) + scale_colour_manual(values = cols) + 
    theme_minimal() + theme(legend.position = "none") + xlab("")

# One plot per feature, re-using the template with a different y variable
SPP2 <- boxfeature + aes(y = SPP2) + ylim(c(0, 80))  # These y-axis limits remove individual outliers that overextend the scales and make the real differences invisible
EXIST <- boxfeature + aes(y = EXIST) + ylim(c(0, 25))
FFP1 <- boxfeature + aes(y = FPP1S) + ylim(c(0, 60))  # NOTE(review): object named FFP1 but plots FPP1S — presumably a typo for FPP1S; harmless as used
AWL <- boxfeature + aes(y = AWL)
XX0 <- boxfeature + aes(y = XX0) + ylim(c(0, 25))
PASS <- boxfeature + aes(y = PASS)
VBN <- boxfeature + aes(y = VBN) + ylim(c(0, 40))

# Assemble the eight boxplots into a 2x4 grid; boxfeature itself supplies CONT
boxplots <- grid.arrange(PASS, VBN, AWL, EXIST, FFP1, SPP2, boxfeature, XX0, ncol = 2, 
    nrow = 4)

# NOTE(review): unlike the other ggsave() calls in this document, this one is
# not commented out, so the SVG is (re)written on every knit
ggsave(here("Plots", "BoxplotsInformativeFeatures.svg"), plot = boxplots, dpi = 300, 
    width = 9, height = 11)

Descriptive PCA results

## PCA data <- readRDS(here('FullMDA', 'dataforPCA.rds')) colnames(data)
# PCA on all quantitative variables (columns 9 onwards). scale. = FALSE here —
# presumably the variables were already standardised during data preparation;
# TODO confirm against 7_Ref_data_prep.Rmd
pca <- prcomp(data[, 9:ncol(data)], scale. = FALSE)  # All quantitative variables

## Access to the PCA results colnames(data)
# Combine the eight metadata columns with each text's scores on the first six PCs
res.ind <- cbind(data[, 1:8], as.data.frame(pca$x)[, 1:6])
head(res.ind)
##                                 Filename     Register Level    Series Country
## 1 Achievers_B1_plus_Informative_0007.txt  Informative     D Achievers   Spain
## 2                 POC_5e_Spoken_0003.txt Conversation     B       POC  France
## 3            Access_4_Narrative_0013.txt      Fiction     D    Access Germany
## 4                  NGL_1_Spoken_0002.txt Conversation     A       NGL Germany
## 5            Access_1_Narrative_0005.txt      Fiction     A    Access Germany
## 6               NGL_2_Narrative_0007.txt      Fiction     B       NGL Germany
##             Corpus             Subcorpus Words        PC1        PC2       PC3
## 1 Textbook.English  Textbook Informative   690 -2.9689011 -1.0213555 0.6131214
## 2 Textbook.English Textbook Conversation   694  2.9897829 -0.8224829 2.0366782
## 3 Textbook.English      Textbook Fiction   547 -0.9601937  1.7910562 0.9396939
## 4 Textbook.English Textbook Conversation   927  2.4059242 -2.2722873 4.3576009
## 5 Textbook.English      Textbook Fiction   840 -0.3931322  0.1616840 3.5972351
## 6 Textbook.English      Textbook Fiction  1127  0.0644907  1.7802591 1.4620307
##          PC4         PC5         PC6
## 1  2.1252104 -1.27551388  1.30951586
## 2  0.9981444  1.25282870  0.14820754
## 3 -0.3421301  2.89055402  0.49870249
## 4 -0.5251778  1.07419977 -0.03533588
## 5 -0.5603783  0.01291763 -0.10092869
## 6 -0.1834330  1.30320166  0.88346462
## Examine individual text values: Textbook Informative texts lying close to
## the origin of the PC1/PC2 plane
res.ind %>%
  filter(Subcorpus == "Textbook Informative") %>%
  filter(PC1 > -0.7, PC1 < 0.2, PC2 > -0.5, PC2 < 0.5) %>%
  select(Filename, Subcorpus)
##                                          Filename            Subcorpus
## 1                      EIM_2_Informative_0003.txt Textbook Informative
## 2                      NGL_4_Informative_0007.txt Textbook Informative
## 3                GreenLine_3_Informative_0002.txt Textbook Informative
## 4          Achievers_B1_plus_Informative_0009.txt Textbook Informative
## 5                      EIM_3_Informative_0006.txt Textbook Informative
## 6                      EIM_3_Informative_0001.txt Textbook Informative
## 7                GreenLine_3_Informative_0005.txt Textbook Informative
## 8 Solutions_Pre-Intermediate_Informative_0007.txt Textbook Informative
## 9                GreenLine_4_Informative_0001.txt Textbook Informative
## Summary statistics
# Mean and SD of Words and the six PC scores per Register x Corpus combination.
# NOTE(review): summarise_if() is superseded (across() is the modern form), but
# it is kept here because across(list(mean, sd)) would interleave the output
# columns instead of grouping all *_mean before all *_sd as printed below.
res.ind %>% group_by(Register, Corpus) %>% summarise_if(is.numeric, c(mean = mean, 
    sd = sd)) %>% as.data.frame()
##       Register            Corpus Words_mean   PC1_mean   PC2_mean   PC3_mean
## 1 Conversation  Textbook.English   853.4531  1.5649317 -0.5732427  1.7069082
## 2 Conversation    Spoken.BNC2014 10637.7440  3.7102869 -0.5155994 -0.6389811
## 3      Fiction  Textbook.English   847.4105 -0.4336570  1.2517651  0.9570921
## 4      Fiction     Youth.Fiction  5944.7817 -0.4868147  1.7493984 -0.2056360
## 5  Informative  Textbook.English   840.2131 -2.0502507 -0.5236450  0.3127942
## 6  Informative Informative.Teens   805.4069 -3.0642993 -0.9630382 -0.2271048
##      PC4_mean   PC5_mean     PC6_mean  Words_sd    PC1_sd    PC2_sd    PC3_sd
## 1  0.61137062  0.4607500 -0.316471091  307.1369 1.2475649 0.7361774 1.4291394
## 2 -0.52542301 -0.1993768  0.117476908 8974.1416 0.4851624 0.4693906 0.7551311
## 3  0.01757880  0.2509411 -0.067575683  208.4950 1.0472422 0.8382890 1.2243113
## 4  0.40802195  0.1411353  0.055182933  198.6376 0.8955793 0.5209732 0.8836525
## 5  0.04558038  0.2866239 -0.053563201  176.3652 1.0387846 0.9593349 1.1042635
## 6 -0.14633809 -0.2629804  0.003253908  193.3123 1.0920901 1.1873367 0.8797793
##      PC4_sd    PC5_sd    PC6_sd
## 1 0.8593066 0.8876894 0.8328930
## 2 0.5179927 0.6135364 0.6487728
## 3 0.6429595 0.8034913 0.8742541
## 4 0.5221250 0.6653948 0.6237283
## 5 0.9772219 0.9221553 0.7571509
## 6 1.1912055 0.8969322 0.9067145
# Same mean/SD summary, broken down by Subcorpus x proficiency Level
# (summarise_if() superseded — kept for column-order consistency with output)
res.ind %>% group_by(Subcorpus, Level) %>% summarise_if(is.numeric, c(mean = mean, 
    sd = sd)) %>% as.data.frame()
##                Subcorpus Level Words_mean   PC1_mean   PC2_mean   PC3_mean
## 1  Textbook Conversation     A   813.0227  1.9092854 -1.0167467  3.4878120
## 2  Textbook Conversation     B   955.1571  2.0034207 -0.6389392  2.0601951
## 3  Textbook Conversation     C   823.6899  1.5949529 -0.3813790  1.3871505
## 4  Textbook Conversation     D   797.8473  1.0912302 -0.4303822  0.7856455
## 5  Textbook Conversation     E   880.6667  0.8486811 -0.5899788  0.9783131
## 6       Textbook Fiction     A   886.0000  0.1283309  0.2241170  2.7351812
## 7       Textbook Fiction     B   869.3239 -0.2308450  1.2762476  1.4671184
## 8       Textbook Fiction     C   854.2069 -0.2716990  1.6316923  0.7552243
## 9       Textbook Fiction     D   801.7794 -0.8380549  1.3757628  0.2587375
## 10      Textbook Fiction     E   842.8654 -0.7514595  1.3438698  0.1681184
## 11  Textbook Informative     A   851.7143 -1.4581371 -0.9605771  1.8578094
## 12  Textbook Informative     B   834.1408 -1.8004464 -0.5374107  0.8993168
## 13  Textbook Informative     C   838.9000 -1.8729877 -0.5397556  0.2542641
## 14  Textbook Informative     D   847.6476 -2.2357094 -0.3151493 -0.1537030
## 15  Textbook Informative     E   830.6724 -2.5812125 -0.6483112 -0.2157163
## 16   Spoken BNC2014 Ref.  Ref. 10637.7440  3.7102869 -0.5155994 -0.6389811
## 17    Youth Fiction Ref.  Ref.  5944.7817 -0.4868147  1.7493984 -0.2056360
## 18       Info Teens Ref.  Ref.   805.4069 -3.0642993 -0.9630382 -0.2271048
##      PC4_mean    PC5_mean     PC6_mean  Words_sd    PC1_sd    PC2_sd    PC3_sd
## 1  -0.1695147  1.03201862 -0.124114073  154.5695 0.8944244 0.6008896 1.0268633
## 2   0.5097138  0.66650714 -0.290544059  441.9040 0.9877238 0.7118864 1.1871891
## 3   0.7486503  0.32249622 -0.282244974  279.1360 1.2022430 0.7255549 1.0449879
## 4   0.9147864  0.16735903 -0.384535442  187.9778 1.4471805 0.7204971 1.2556890
## 5   1.0595415  0.06909751 -0.671648146  325.5879 1.3201172 0.7615324 0.9047521
## 6  -0.2672265  0.65411535 -0.218156756  242.7776 0.9731705 0.6896123 0.8950683
## 7  -0.1413873  0.31629116 -0.191186618  220.0597 0.8331461 0.6120537 0.9063155
## 8   0.1153949  0.23117651 -0.122166842  198.2061 0.8922167 0.7809139 0.9633007
## 9   0.1457670  0.15179564 -0.009406557  196.3337 1.1520193 0.7901601 0.8252677
## 10  0.1550683  0.03428962  0.190272042  189.8283 1.1136676 0.7945435 0.9015517
## 11 -0.7049678  1.25570509  0.106745609  199.7658 0.8248181 0.8644780 0.6872746
## 12 -0.1896649  0.45539556 -0.054719011  178.6216 0.9734293 1.0090411 1.0645704
## 13  0.1372063  0.34670681 -0.128786716  160.2093 1.0859612 0.9827889 0.9336982
## 14  0.1251155  0.13168059  0.013371807  179.2517 0.9452589 0.9171954 0.9518653
## 15  0.4097224 -0.20053922 -0.133988094  185.5912 1.0185482 0.9107905 0.7664382
## 16 -0.5254230 -0.19937677  0.117476908 8974.1416 0.4851624 0.4693906 0.7551311
## 17  0.4080219  0.14113532  0.055182933  198.6376 0.8955793 0.5209732 0.8836525
## 18 -0.1463381 -0.26298042  0.003253908  193.3123 1.0920901 1.1873367 0.8797793
##       PC4_sd    PC5_sd    PC6_sd
## 1  0.7300862 0.7524320 0.7923821
## 2  0.8074437 0.8630538 0.6956873
## 3  0.7947628 0.8860826 0.9109988
## 4  0.7921655 0.7555659 0.8921725
## 5  0.6346730 0.8973972 0.7349033
## 6  0.8144431 0.7688781 0.5047089
## 7  0.5765336 0.7891885 0.8696473
## 8  0.6251870 0.6933910 0.8554381
## 9  0.5882694 0.8568478 0.9060935
## 10 0.5955164 0.8065782 1.0168335
## 11 0.8517101 0.9290706 0.6747449
## 12 1.0361606 0.8508655 0.7585868
## 13 1.0584275 0.7153690 0.7596570
## 14 0.8314073 0.9666093 0.7998395
## 15 0.8381935 0.8040606 0.7093144
## 16 0.5179927 0.6135364 0.6487728
## 17 0.5221250 0.6653948 0.6237283
## 18 1.1912055 0.8969322 0.9067145
library(gtsummary)
# One factor level per Corpus x Register x Level combination, used as the
# grouping variable of the summary table below
res.ind <- res.ind %>% mutate(Subsubcorpus = paste(Corpus, Register, Level, sep = "_")) %>% 
    mutate(Subsubcorpus = as.factor(Subsubcorpus))

# Mean (SD) of each of the six PC scores per subsubcorpus, to two decimals
res.ind %>% select(PC1, PC2, PC3, PC4, PC5, PC6, Subsubcorpus) %>% tbl_summary(by = Subsubcorpus, 
    digits = list(all_continuous() ~ c(2, 2)), statistic = all_continuous() ~ "{mean} ({sd})")
Characteristic Informative.Teens_Informative_Ref., N = 1,3371 Spoken.BNC2014_Conversation_Ref., N = 1,2501 Textbook.English_Conversation_A, N = 881 Textbook.English_Conversation_B, N = 1401 Textbook.English_Conversation_C, N = 1581 Textbook.English_Conversation_D, N = 1311 Textbook.English_Conversation_E, N = 481 Textbook.English_Fiction_A, N = 361 Textbook.English_Fiction_B, N = 711 Textbook.English_Fiction_C, N = 581 Textbook.English_Fiction_D, N = 681 Textbook.English_Fiction_E, N = 521 Textbook.English_Informative_A, N = 281 Textbook.English_Informative_B, N = 711 Textbook.English_Informative_C, N = 901 Textbook.English_Informative_D, N = 1051 Textbook.English_Informative_E, N = 581 Youth.Fiction_Fiction_Ref., N = 1,1911
PC1 -3.06 (1.09) 3.71 (0.49) 1.91 (0.89) 2.00 (0.99) 1.59 (1.20) 1.09 (1.45) 0.85 (1.32) 0.13 (0.97) -0.23 (0.83) -0.27 (0.89) -0.84 (1.15) -0.75 (1.11) -1.46 (0.82) -1.80 (0.97) -1.87 (1.09) -2.24 (0.95) -2.58 (1.02) -0.49 (0.90)
PC2 -0.96 (1.19) -0.52 (0.47) -1.02 (0.60) -0.64 (0.71) -0.38 (0.73) -0.43 (0.72) -0.59 (0.76) 0.22 (0.69) 1.28 (0.61) 1.63 (0.78) 1.38 (0.79) 1.34 (0.79) -0.96 (0.86) -0.54 (1.01) -0.54 (0.98) -0.32 (0.92) -0.65 (0.91) 1.75 (0.52)
PC3 -0.23 (0.88) -0.64 (0.76) 3.49 (1.03) 2.06 (1.19) 1.39 (1.04) 0.79 (1.26) 0.98 (0.90) 2.74 (0.90) 1.47 (0.91) 0.76 (0.96) 0.26 (0.83) 0.17 (0.90) 1.86 (0.69) 0.90 (1.06) 0.25 (0.93) -0.15 (0.95) -0.22 (0.77) -0.21 (0.88)
PC4 -0.15 (1.19) -0.53 (0.52) -0.17 (0.73) 0.51 (0.81) 0.75 (0.79) 0.91 (0.79) 1.06 (0.63) -0.27 (0.81) -0.14 (0.58) 0.12 (0.63) 0.15 (0.59) 0.16 (0.60) -0.70 (0.85) -0.19 (1.04) 0.14 (1.06) 0.13 (0.83) 0.41 (0.84) 0.41 (0.52)
PC5 -0.26 (0.90) -0.20 (0.61) 1.03 (0.75) 0.67 (0.86) 0.32 (0.89) 0.17 (0.76) 0.07 (0.90) 0.65 (0.77) 0.32 (0.79) 0.23 (0.69) 0.15 (0.86) 0.03 (0.81) 1.26 (0.93) 0.46 (0.85) 0.35 (0.72) 0.13 (0.97) -0.20 (0.80) 0.14 (0.67)
PC6 0.00 (0.91) 0.12 (0.65) -0.12 (0.79) -0.29 (0.70) -0.28 (0.91) -0.38 (0.89) -0.67 (0.73) -0.22 (0.50) -0.19 (0.87) -0.12 (0.86) -0.01 (0.91) 0.19 (1.02) 0.11 (0.67) -0.05 (0.76) -0.13 (0.76) 0.01 (0.80) -0.13 (0.71) 0.06 (0.62)

1 Mean (SD)

# Robust alternative to the mean/SD tables above: median and MAD of PC4 scores
# per Register x Level (summarise_if() is superseded by across())
res.ind %>% select(Register, Level, PC4) %>% group_by(Register, Level) %>% summarise_if(is.numeric, 
    c(Median = median, MAD = mad))
## # A tibble: 18 × 4
## # Groups:   Register [3]
##    Register     Level  Median   MAD
##    <fct>        <fct>   <dbl> <dbl>
##  1 Conversation A     -0.0696 0.684
##  2 Conversation B      0.543  0.810
##  3 Conversation C      0.735  0.797
##  4 Conversation D      0.857  0.695
##  5 Conversation E      1.10   0.690
##  6 Conversation Ref.  -0.541  0.522
##  7 Fiction      A     -0.382  0.749
##  8 Fiction      B     -0.170  0.648
##  9 Fiction      C      0.125  0.670
## 10 Fiction      D      0.0135 0.539
## 11 Fiction      E      0.0456 0.751
## 12 Fiction      Ref.   0.404  0.499
## 13 Informative  A     -0.706  0.823
## 14 Informative  B     -0.104  1.05 
## 15 Informative  C      0.0700 1.09 
## 16 Informative  D      0.0409 0.787
## 17 Informative  E      0.369  0.697
## 18 Informative  Ref.  -0.0753 1.21
# Search for example texts to illustrate results: uncomment and combine the
# filters below to home in on texts with particular PC score profiles.
res.ind %>%
  filter(PC3 > 2.5 & PC2 < -2) %>%
  # filter(Register == 'Conversation') %>%
  # filter(Level == 'B') %>%
  # filter(PC1 > 4.7) %>%
  select(Filename, PC1, PC2, PC3) %>%
  # Formula lambda: passing extra arguments through across()'s `...`
  # (the original `round, 2`) is deprecated as of dplyr 1.1
  mutate(across(where(is.numeric), ~ round(.x, 2)))
##                    Filename   PC1   PC2  PC3
## 1     NGL_1_Spoken_0002.txt  2.41 -2.27 4.36
## 2  HT_5_ELF_Spoken_0003.txt  1.94 -2.30 3.49
## 3 HT_6_Informative_0001.txt -2.19 -2.36 3.11

Raincloud plots

# Order the subcorpora so that each textbook register is immediately preceded
# by its reference corpus (used for the paired raincloud plot below)
res.ind$Subcorpus <- fct_relevel(res.ind$Subcorpus, "Spoken BNC2014 Ref.", "Textbook Conversation", "Youth Fiction Ref.", "Textbook Fiction", "Info Teens Ref.", "Textbook Informative")

# Build a nine-colour range from two suffrager palettes, then pick six of them
colours <- suf_palette(name = "london", n = 6, type = "continuous") 
colours2 <- suf_palette(name = "classic", n = 5, type = "continuous") 
colours <- c(colours, colours2[c(2:4)]) # Nine colours range
palette <- colours[c(1,5,6,2,3,8,7,4,9)] # Good order for PCA

# NOTE: `colours` is deliberately overwritten here with the six colours needed
# for the six subcorpora, in the order set by fct_relevel() above
colours <- palette[c(1,8,9,2,7,3)]

# Raincloud plot of PC1 scores: half-violin density + jittered individual texts
# + boxplot per subcorpus, with Reference and Textbook data paired within each
# of the three registers (order set by the fct_relevel() call above)
ggplot(res.ind, aes(x=Subcorpus,y=PC1, fill = Subcorpus, colour = Subcorpus))+ # Or leave out "colour = Register" to keep the dots in black
  geom_flat_violin(position = position_nudge(x = .25, y = 0),adjust = 2, trim = FALSE)+
  geom_point(position = position_jitter(width = .15), size = .25)+
# note that here we need to set the x-variable to a numeric variable and bump it to get the boxplots to line up with the rainclouds. 
  geom_boxplot(aes(x = as.numeric(Subcorpus)+0.25, y = PC1), outlier.shape = NA, alpha = 0.3, width = .15, colour = "BLACK") +
  ylab('PC1')+ 
  theme_cowplot()+
  theme(axis.title.x=element_blank())+
  guides(fill = "none", colour = "none") +
  scale_colour_manual(values = colours)+
  scale_fill_manual(values = colours) +
  # Register group labels and horizontal brackets drawn below the boxplots
  annotate(geom = "text", x = 1.5, y = -7, label = "Conversation", size = 5) +
  annotate(geom = "segment", x = 0.7, xend = 2.5, y = -6.5, yend = -6.5) +
  annotate(geom = "text", x = 3.5, y = -7, label = "Fiction", size = 5) +
  annotate(geom = "segment", x = 2.7, xend = 4.5, y = -6.5, yend = -6.5) +
  annotate(geom = "text", x = 5.7, y = -7, label = "Informative", size = 5) +
  annotate(geom = "segment", x = 4.7, xend = 6.5, y = -6.5, yend = -6.5) +
  scale_x_discrete(labels=rep(c("Reference", "Textbook"), 3))+
  scale_y_continuous(sec.axis = dup_axis(name=NULL), breaks = seq(from = -6, to = 5, by = 1))

#ggsave(here("Plots", "PC11_3RegComparison.svg"), width = 13, height = 8)
#ggsave(here("Plots", "PC1_3RegComparison.png"), width = 20, height = 15, units = "cm", dpi = 300)

Mixed-effects models predicting PC scores

library(stringr)

# Add Source variable for random effect variable in mixed effect models:
# - Youth.Fiction: one source per book ("Book" + number extracted from filename)
# - Spoken.BNC2014: a single source for the entire corpus
# - Textbook.English: the textbook series
# - Informative.Teens: website extracted from the filename (BBC|Science_Tech),
#   falling back to the prefix before the first underscore when str_extract()
#   returns NA (no match).
# NOTE(review): the catch-all TRUE ~ "NA" yields the *string* "NA", not a
# missing value; only genuine NAs produced by str_extract() are caught by the
# second case_when(). Finally, Corpus is collapsed to Textbook vs. Reference.
res.ind <- res.ind %>% mutate(Source = case_when(Corpus == "Youth.Fiction" ~ paste("Book", 
    str_extract(Filename, "[0-9]{1,3}"), sep = ""), Corpus == "Spoken.BNC2014" ~ 
    "Spoken.BNC2014", Corpus == "Textbook.English" ~ as.character(Series), Corpus == 
    "Informative.Teens" ~ str_extract(Filename, "BBC|Science_Tech"), TRUE ~ "NA")) %>% 
    mutate(Source = case_when(Corpus == "Informative.Teens" & is.na(Source) ~ str_remove(Filename, 
        "_.*"), TRUE ~ as.character(Source))) %>% mutate(Source = as.factor(Source)) %>% 
    mutate(Corpus = case_when(Corpus == "Textbook.English" ~ "Textbook", Corpus == 
        "Informative.Teens" ~ "Reference", Corpus == "Spoken.BNC2014" ~ "Reference", 
        Corpus == "Youth.Fiction" ~ "Reference")) %>% mutate(Corpus = as.factor(Corpus))

summary(res.ind$Source)
## Spoken.BNC2014         Access      Solutions            NGL      GreenLine 
##           1250            219            207            188            127 
##      Achievers             HT      TeenVogue       WhyFiles        History 
##            107            104            100            100             99 
##           Teen   Encyclopedia    Factmonster           Dogo      Ducksters 
##             98             97             97             96             96 
##            JTT            EIM        Science          Quatr            BBC 
##             96             95             95             93             92 
##          World       Revision   Science_Tech            POC   TweenTribute 
##             83             82             81             59             28 
##          Book1         Book10        Book100        Book101        Book102 
##              4              4              4              4              4 
##        Book103        Book104        Book105        Book106        Book107 
##              4              4              4              4              4 
##        Book108        Book109         Book11        Book110        Book111 
##              4              4              4              4              4 
##        Book112        Book113        Book114        Book115        Book116 
##              4              4              4              4              4 
##        Book117        Book118        Book119         Book12        Book120 
##              4              4              4              4              4 
##        Book121        Book122        Book123        Book124        Book125 
##              4              4              4              4              4 
##        Book126        Book127        Book128        Book129         Book13 
##              4              4              4              4              4 
##        Book130        Book131        Book132        Book133        Book134 
##              4              4              4              4              4 
##        Book135        Book136        Book137        Book138        Book139 
##              4              4              4              4              4 
##         Book14        Book140        Book141        Book142        Book143 
##              4              4              4              4              4 
##        Book144        Book145        Book146        Book147        Book148 
##              4              4              4              4              4 
##        Book149         Book15        Book150        Book151        Book152 
##              4              4              4              4              4 
##        Book153        Book154        Book155        Book156        Book157 
##              4              4              4              4              4 
##        Book158        Book159         Book16        Book160        Book161 
##              4              4              4              4              4 
##        Book162        Book163        Book164        Book165        (Other) 
##              4              4              4              4            895
# Change the reference level to a theoretically more meaningful level and one
# that is better populated (see, e.g.,
# https://stats.stackexchange.com/questions/430770/in-a-multilevel-linear-regression-how-does-the-reference-level-affect-other-lev)
summary(res.ind$Corpus)
## Reference  Textbook 
##      3778      1202
summary(res.ind$Subcorpus)
##   Spoken BNC2014 Ref. Textbook Conversation    Youth Fiction Ref. 
##                  1250                   565                  1191 
##      Textbook Fiction       Info Teens Ref.  Textbook Informative 
##                   285                  1337                   352
summary(res.ind$Level)
##    A    B    C    D    E Ref. 
##  152  282  306  304  158 3778
# Make the reference data the baseline level of each factor, so that model
# coefficients express textbook effects relative to the reference corpora
res.ind$Corpus <- relevel(res.ind$Corpus, "Reference")
res.ind$Level <- relevel(res.ind$Level, "Ref.")
res.ind$Subcorpus <- factor(
  res.ind$Subcorpus,
  levels = c(
    "Spoken BNC2014 Ref.", "Textbook Conversation",
    "Youth Fiction Ref.", "Textbook Fiction",
    "Info Teens Ref.", "Textbook Informative"
  )
)
# Mixed-effects models PC1
# NOTE(review): md_register and md_both were originally both created as
# update(md_corpus, . ~ . + Register), i.e. two identical Level + Register
# models — hence Chisq = 0 on 0 Df for md_both in the printed anova below.
# md_register is now derived from md_source so that the four fixed-effects
# structures (Level; Register; Level + Register; + interaction) are distinct.
# md_interaction is unchanged, so the tab_model output below remains valid;
# the anova table needs re-knitting. (Variable names kept for consistency:
# md_corpus in fact adds Level, which subsumes the Corpus distinction.)
md_source <- lmer(PC1 ~ 1 + (Register | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)        # + proficiency level
md_register <- update(md_source, . ~ . + Register)   # + register only
md_both <- update(md_corpus, . ~ . + Register)       # Level + Register
md_interaction <- update(md_both, . ~ . + Level:Register)

anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC1 ~ 1 + (Register | Source)
## md_corpus: PC1 ~ (Register | Source) + Level
## md_register: PC1 ~ (Register | Source) + Level + Register
## md_both: PC1 ~ (Register | Source) + Level + Register
## md_interaction: PC1 ~ (Register | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance   Chisq Df Pr(>Chisq)    
## md_source         8 12421 12473 -6202.5    12405                          
## md_corpus        13 12220 12305 -6097.0    12194 210.965  5  < 2.2e-16 ***
## md_register      15 12156 12254 -6063.3    12126  67.488  2  2.214e-15 ***
## md_both          15 12156 12254 -6063.3    12126   0.000  0               
## md_interaction   25 12147 12310 -6048.5    12097  29.566 10   0.001008 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md_interaction, wrap.labels = 200)
  PC 1
Predictors Estimates CI p
(Intercept) 3.71 2.48 – 4.94 <0.001
Level [A] -1.72 -3.03 – -0.41 0.010
Level [B] -1.76 -3.06 – -0.45 0.008
Level [C] -2.07 -3.38 – -0.77 0.002
Level [D] -2.46 -3.76 – -1.15 <0.001
Level [E] -2.79 -4.11 – -1.47 <0.001
Register [Fiction] -4.20 -5.44 – -2.97 <0.001
Register [Informative] -6.75 -8.02 – -5.48 <0.001
Level [A] * Register [Fiction] 2.18 0.88 – 3.48 0.001
Level [B] * Register [Fiction] 1.79 0.51 – 3.08 0.006
Level [C] * Register [Fiction] 2.12 0.83 – 3.40 0.001
Level [D] * Register [Fiction] 1.93 0.65 – 3.21 0.003
Level [E] * Register [Fiction] 2.41 1.11 – 3.71 <0.001
Level [A] * Register [Informative] 3.31 1.96 – 4.66 <0.001
Level [B] * Register [Informative] 3.07 1.75 – 4.40 <0.001
Level [C] * Register [Informative] 3.20 1.88 – 4.52 <0.001
Level [D] * Register [Informative] 3.17 1.85 – 4.49 <0.001
Level [E] * Register [Informative] 3.21 1.87 – 4.56 <0.001
Random Effects
σ2 0.59
τ00 Source 0.40
τ11 Source.RegisterFiction 0.13
τ11 Source.RegisterInformative 0.23
ρ01 -0.06
-0.51
ICC 0.40
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.870 / 0.923

This chunk was used to check the model assumptions of each of the mixed-effects models in this section (re-run with the relevant model object in place of `md_interaction`).

# cf.
# https://stackoverflow.com/questions/63751541/plot-does-not-show-all-diagnostic-plots-for-lme-lmer

# check distribution of residuals: residuals vs. fitted values
# (look for non-linearity and unequal spread)
plot(md_interaction)

# scale-location plot: homoscedasticity check
plot(md_interaction, sqrt(abs(resid(.))) ~ fitted(.), type = c("p", "smooth"), col.line = 1)

# Q-Q plot of the residuals: normality check
lattice::qqmath(md_interaction)

# PC2
# NOTE(review): md_register and md_both were originally both created as
# update(md_corpus, . ~ . + Register), i.e. two identical models (Chisq = 0 on
# 0 Df in the printed anova). md_register is now the Register-only model;
# md_interaction is unchanged, so the tab_model output below remains valid.
md_source <- lmer(PC2 ~ 1 + (Register | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)        # + proficiency level
md_register <- update(md_source, . ~ . + Register)   # + register only
md_both <- update(md_corpus, . ~ . + Register)       # Level + Register
md_interaction <- update(md_both, . ~ . + Level:Register)

anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC2 ~ 1 + (Register | Source)
## md_corpus: PC2 ~ (Register | Source) + Level
## md_register: PC2 ~ (Register | Source) + Level + Register
## md_both: PC2 ~ (Register | Source) + Level + Register
## md_interaction: PC2 ~ (Register | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance   Chisq Df Pr(>Chisq)    
## md_source         8 10755 10807 -5369.5    10739                          
## md_corpus        13 10608 10693 -5291.0    10582 156.958  5  < 2.2e-16 ***
## md_register      15 10542 10640 -5256.0    10512  70.045  2  6.165e-16 ***
## md_both          15 10542 10640 -5256.0    10512   0.000  0               
## md_interaction   25 10508 10670 -5228.8    10458  54.223 10  4.409e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md_interaction, wrap.labels = 200)
  PC 2
Predictors Estimates CI p
(Intercept) -0.52 -0.90 – -0.14 0.008
Level [A] -0.51 -0.94 – -0.09 0.018
Level [B] -0.12 -0.54 – 0.29 0.563
Level [C] 0.12 -0.29 – 0.54 0.561
Level [D] 0.07 -0.35 – 0.49 0.737
Level [E] 0.03 -0.42 – 0.47 0.909
Register [Fiction] 2.27 1.88 – 2.65 <0.001
Register [Informative] -0.46 -0.96 – 0.03 0.068
Level [A] * Register [Fiction] -1.09 -1.58 – -0.60 <0.001
Level [B] * Register [Fiction] -0.36 -0.82 – 0.10 0.124
Level [C] * Register [Fiction] -0.30 -0.76 – 0.15 0.193
Level [D] * Register [Fiction] -0.49 -0.94 – -0.03 0.036
Level [E] * Register [Fiction] -0.45 -0.94 – 0.04 0.072
Level [A] * Register [Informative] 0.43 -0.22 – 1.07 0.194
Level [B] * Register [Informative] 0.54 -0.06 – 1.15 0.079
Level [C] * Register [Informative] 0.14 -0.47 – 0.74 0.659
Level [D] * Register [Informative] 0.45 -0.15 – 1.05 0.142
Level [E] * Register [Informative] 0.40 -0.23 – 1.03 0.215
Random Effects
σ2 0.45
τ00 Source 0.04
τ11 Source.RegisterFiction 0.04
τ11 Source.RegisterInformative 0.20
ρ01 0.24
0.89
ICC 0.28
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.661 / 0.756
# PC3 md_source <- lmer(PC3 ~ 1 + (Register|Source), res.ind, REML = FALSE) #
# Fails to converge, hence the intercept-only random effect below
# NOTE(review): md_register and md_both were originally both created as
# update(md_corpus, . ~ . + Register), i.e. two identical models (Chisq = 0 on
# 0 Df in the printed anova). md_register is now the Register-only model;
# md_interaction is unchanged, so the tab_model output below remains valid.
md_source <- lmer(PC3 ~ 1 + (1 | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)        # + proficiency level
md_register <- update(md_source, . ~ . + Register)   # + register only
md_both <- update(md_corpus, . ~ . + Register)       # Level + Register
md_interaction <- update(md_both, . ~ . + Level:Register)

anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC3 ~ 1 + (1 | Source)
## md_corpus: PC3 ~ (1 | Source) + Level
## md_register: PC3 ~ (1 | Source) + Level + Register
## md_both: PC3 ~ (1 | Source) + Level + Register
## md_interaction: PC3 ~ (1 | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance    Chisq Df Pr(>Chisq)    
## md_source         3 13523 13542 -6758.3    13517                           
## md_corpus         8 12083 12136 -6033.7    12067 1449.306  5    < 2e-16 ***
## md_register      10 11605 11670 -5792.5    11585  482.461  2    < 2e-16 ***
## md_both          10 11605 11670 -5792.5    11585    0.000  0               
## md_interaction   20 11608 11738 -5783.8    11568   17.312 10    0.06774 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md_interaction, wrap.labels = 200)
  PC 3
Predictors Estimates CI p
(Intercept) -0.64 -1.99 – 0.71 0.353
Level [A] 4.27 2.84 – 5.70 <0.001
Level [B] 2.90 1.48 – 4.33 <0.001
Level [C] 2.13 0.71 – 3.56 0.003
Level [D] 1.67 0.24 – 3.09 0.022
Level [E] 1.24 -0.20 – 2.68 0.091
Register [Fiction] 0.43 -0.92 – 1.78 0.532
Register [Informative] 0.43 -0.96 – 1.82 0.543
Level [A] * Register [Fiction] -1.45 -2.83 – -0.07 0.039
Level [B] * Register [Fiction] -1.25 -2.61 – 0.12 0.074
Level [C] * Register [Fiction] -1.33 -2.70 – 0.03 0.056
Level [D] * Register [Fiction] -1.31 -2.68 – 0.06 0.060
Level [E] * Register [Fiction] -1.21 -2.59 – 0.17 0.086
Level [A] * Register [Informative] -1.99 -3.42 – -0.56 0.006
Level [B] * Register [Informative] -1.58 -2.98 – -0.17 0.028
Level [C] * Register [Informative] -1.41 -2.81 – -0.00 0.050
Level [D] * Register [Informative] -1.47 -2.88 – -0.07 0.040
Level [E] * Register [Informative] -1.51 -2.93 – -0.09 0.037
Random Effects
σ2 0.53
τ00 Source 0.47
ICC 0.47
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.423 / 0.694
# PC4 md_source <- lmer(PC4 ~ 1 + (Register|Source), res.ind, REML = FALSE) #
# Singular fit, hence the intercept-only random effect below
# NOTE(review): md_register and md_both were originally both created as
# update(md_corpus, . ~ . + Register), i.e. two identical models (Chisq = 0 on
# 0 Df in the printed anova). md_register is now the Register-only model;
# md_interaction is unchanged, so the tab_model output below remains valid.
md_source <- lmer(PC4 ~ 1 + (1 | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)        # + proficiency level
md_register <- update(md_source, . ~ . + Register)   # + register only
md_both <- update(md_corpus, . ~ . + Register)       # Level + Register
md_interaction <- update(md_both, . ~ . + Level:Register)

anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC4 ~ 1 + (1 | Source)
## md_corpus: PC4 ~ (1 | Source) + Level
## md_register: PC4 ~ (1 | Source) + Level + Register
## md_both: PC4 ~ (1 | Source) + Level + Register
## md_interaction: PC4 ~ (1 | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md_source         3 11019 11039 -5506.5    11013                         
## md_corpus         8 10832 10884 -5407.8    10816 197.35  5  < 2.2e-16 ***
## md_register      10 10575 10640 -5277.6    10555 260.55  2  < 2.2e-16 ***
## md_both          10 10575 10640 -5277.6    10555   0.00  0               
## md_interaction   20 10538 10669 -5249.2    10498  56.83 10  1.435e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md_interaction, wrap.labels = 200)
  PC 4
Predictors Estimates CI p
(Intercept) -0.53 -1.31 – 0.26 0.187
Level [A] 0.36 -0.48 – 1.19 0.402
Level [B] 0.96 0.13 – 1.79 0.023
Level [C] 1.27 0.44 – 2.10 0.003
Level [D] 1.44 0.61 – 2.27 0.001
Level [E] 1.72 0.87 – 2.57 <0.001
Register [Fiction] 0.93 0.15 – 1.72 0.020
Register [Informative] 0.38 -0.43 – 1.19 0.359
Level [A] * Register [Fiction] -1.13 -1.95 – -0.30 0.008
Level [B] * Register [Fiction] -1.65 -2.46 – -0.85 <0.001
Level [C] * Register [Fiction] -1.59 -2.40 – -0.78 <0.001
Level [D] * Register [Fiction] -1.75 -2.56 – -0.95 <0.001
Level [E] * Register [Fiction] -1.92 -2.75 – -1.09 <0.001
Level [A] * Register [Informative] -0.90 -1.75 – -0.04 0.040
Level [B] * Register [Informative] -1.05 -1.88 – -0.22 0.014
Level [C] * Register [Informative] -0.99 -1.81 – -0.16 0.019
Level [D] * Register [Informative] -1.18 -2.01 – -0.36 0.005
Level [E] * Register [Informative] -1.15 -1.99 – -0.30 0.008
Random Effects
σ2 0.45
τ00 Source 0.16
ICC 0.26
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.232 / 0.432
# PC5: a model with by-Source random slopes for Register,
#   lmer(PC5 ~ 1 + (Register | Source), res.ind, REML = FALSE),
# fails to converge, so we fall back to a by-Source random intercept only.
md_source <- lmer(PC5 ~ 1 + (1 | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)
md_register <- update(md_corpus, . ~ . + Register)
# NOTE(review): md_both duplicates md_register exactly (see the 0 Df row in
# the anova below). Presumably md_register was meant to add Register to
# md_source rather than to md_corpus; confirm intended ladder.
md_both <- update(md_corpus, . ~ . + Register)
# Full model: both main effects plus the Level:Register interaction.
md_interaction <- update(md_both, . ~ . + Level:Register)

# Likelihood-ratio tests across the nested model ladder (ML fits).
anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC5 ~ 1 + (1 | Source)
## md_corpus: PC5 ~ (1 | Source) + Level
## md_register: PC5 ~ (1 | Source) + Level + Register
## md_both: PC5 ~ (1 | Source) + Level + Register
## md_interaction: PC5 ~ (1 | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance    Chisq Df Pr(>Chisq)    
## md_source         3 10937 10957 -5465.6    10931                           
## md_corpus         8 10748 10800 -5366.0    10732 199.3592  5  < 2.2e-16 ***
## md_register      10 10743 10808 -5361.5    10723   8.8428  2    0.01202 *  
## md_both          10 10743 10808 -5361.5    10723   0.0000  0               
## md_interaction   20 10724 10854 -5341.9    10684  39.2992 10   2.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Regression table of the PC5 interaction model.
tab_model(md_interaction, wrap.labels = 200)
  PC 5
Predictors Estimates CI p
(Intercept) -0.20 -1.14 – 0.74 0.677
Level [A] 1.18 0.18 – 2.18 0.020
Level [B] 0.82 -0.18 – 1.82 0.106
Level [C] 0.49 -0.51 – 1.48 0.339
Level [D] 0.34 -0.66 – 1.33 0.509
Level [E] 0.31 -0.70 – 1.32 0.544
Register [Fiction] 0.34 -0.60 – 1.28 0.476
Register [Informative] -0.09 -1.06 – 0.88 0.857
Level [A] * Register [Fiction] -0.75 -1.73 – 0.23 0.132
Level [B] * Register [Fiction] -0.73 -1.69 – 0.23 0.136
Level [C] * Register [Fiction] -0.43 -1.40 – 0.53 0.376
Level [D] * Register [Fiction] -0.38 -1.34 – 0.58 0.442
Level [E] * Register [Fiction] -0.43 -1.41 – 0.55 0.386
Level [A] * Register [Informative] 0.34 -0.67 – 1.36 0.506
Level [B] * Register [Informative] -0.11 -1.10 – 0.87 0.820
Level [C] * Register [Informative] 0.11 -0.88 – 1.09 0.831
Level [D] * Register [Informative] 0.04 -0.94 – 1.03 0.934
Level [E] * Register [Informative] -0.22 -1.23 – 0.78 0.662
Random Effects
σ2 0.46
τ00 Source 0.23
ICC 0.33
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.110 / 0.406
# PC6: a model with by-Source random slopes for Register,
#   lmer(PC6 ~ 1 + (Register | Source), res.ind, REML = FALSE),
# fails to converge, so we fall back to a by-Source random intercept only.
md_source <- lmer(PC6 ~ 1 + (1 | Source), res.ind, REML = FALSE)
md_corpus <- update(md_source, . ~ . + Level)
md_register <- update(md_corpus, . ~ . + Register)
# NOTE(review): md_both duplicates md_register exactly (see the 0 Df row in
# the anova below). Presumably md_register was meant to add Register to
# md_source rather than to md_corpus; confirm intended ladder.
md_both <- update(md_corpus, . ~ . + Register)
# Full model: both main effects plus the Level:Register interaction.
md_interaction <- update(md_both, . ~ . + Level:Register)

# Likelihood-ratio tests across the nested model ladder (ML fits).
anova(md_source, md_corpus, md_register, md_both, md_interaction)
## Data: res.ind
## Models:
## md_source: PC6 ~ 1 + (1 | Source)
## md_corpus: PC6 ~ (1 | Source) + Level
## md_register: PC6 ~ (1 | Source) + Level + Register
## md_both: PC6 ~ (1 | Source) + Level + Register
## md_interaction: PC6 ~ (1 | Source) + Level + Register + Level:Register
##                npar   AIC   BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md_source         3 11014 11034 -5504.2    11008                         
## md_corpus         8 11014 11066 -5499.1    10998 10.197  5    0.06985 .  
## md_register      10 10983 11048 -5481.6    10963 35.146  2  2.334e-08 ***
## md_both          10 10983 11048 -5481.6    10963  0.000  0               
## md_interaction   20 10965 11095 -5462.4    10925 38.373 10  3.268e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Regression table of the PC6 interaction model.
tab_model(md_interaction, wrap.labels = 200)
  PC 6
Predictors Estimates CI p
(Intercept) 0.12 -0.60 – 0.84 0.749
Level [A] -0.31 -1.09 – 0.46 0.431
Level [B] -0.44 -1.21 – 0.33 0.259
Level [C] -0.50 -1.26 – 0.27 0.206
Level [D] -0.51 -1.28 – 0.26 0.191
Level [E] -0.90 -1.68 – -0.11 0.026
Register [Fiction] -0.06 -0.79 – 0.66 0.865
Register [Informative] -0.11 -0.85 – 0.64 0.782
Level [A] * Register [Fiction] -0.11 -0.89 – 0.66 0.773
Level [B] * Register [Fiction] 0.07 -0.68 – 0.82 0.860
Level [C] * Register [Fiction] 0.19 -0.56 – 0.94 0.622
Level [D] * Register [Fiction] 0.35 -0.40 – 1.10 0.364
Level [E] * Register [Fiction] 0.92 0.15 – 1.70 0.020
Level [A] * Register [Informative] 0.41 -0.39 – 1.21 0.318
Level [B] * Register [Informative] 0.39 -0.38 – 1.17 0.317
Level [C] * Register [Informative] 0.36 -0.41 – 1.13 0.361
Level [D] * Register [Informative] 0.46 -0.30 – 1.23 0.236
Level [E] * Register [Informative] 0.58 -0.21 – 1.38 0.149
Random Effects
σ2 0.49
τ00 Source 0.13
ICC 0.21
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.041 / 0.247
# Tweak plot aesthetics with: https://cran.r-project.org/web/packages/sjPlot/vignettes/custplot.html
# Colour customisation trick from: https://stackoverflow.com/questions/55598920/different-line-colors-in-forest-plot-output-from-sjplot-r-package

# Forest plot of the fixed effects of the interaction model, with coefficient
# values and p-value stars printed next to each estimate.
# NOTE(review): at this point in the script md_interaction is the PC6 model
# (last fitted above), but axis.title says "PC4" — one of these looks stale
# (copy-paste from the PC4 chunk?); confirm which PC this figure shows.
plot_model(md_interaction, 
           #type = "re", # Option to visualise random effects 
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           # `palette` is assumed to be defined earlier in the file
           # (presumably a suffrager colour palette) — TODO confirm
           colors = palette[c(1:3,7:9)],
           # Manual colour groups: intercept, Level terms, Register terms,
           # and the two sets of interaction terms
           group.terms = c(1,5,5,5,5,5,6,4,2,2,2,2,2,3,3,3,3,3), 
           title="Fixed effects",
           wrap.labels = 40,
           axis.title = "PC4 estimated coefficients") +
  theme_sjplot2() 

#ggsave(here("Plots", "TxBRef3Reg_PCA4_lmer_fixed.svg"), height = 6, width = 7)

#svg(here("Plots", "TxBReg3Reg_predicted_PC6_scores_interactions.svg"), height = 8, width = 9)
# Conditional (predicted-value) plots of the Level x Register interaction,
# one panel per Register level, with the raw points overlaid.
# NOTE(review): ylab is "PC1", but the model in scope here is the PC6
# interaction model and the commented svg filename above says PC6 — the
# y-axis label looks stale from an earlier chunk; confirm.
visreg(md_interaction, xvar = "Level", by="Register", 
       #type = "contrast",
       type = "conditional",
       line=list(col="darkred"), 
       points=list(cex=0.3),
       xlab = "Ref. corpora and textbook level (A to E)", ylab = "PC1",
       layout=c(3,1)
)

#dev.off()

# Single-predictor model: PC1 as a function of Subcorpus (the six textbook
# and reference subcorpora), with a by-Source random intercept.
# Fitted by maximum likelihood (REML = FALSE) for LRT/AIC comparability.
md_subcorpus <- lmer(PC1 ~ Subcorpus + (1 | Source), data = res.ind, REML = FALSE)
# HTML summary table of the model's fixed and random effects.
tab_model(md_subcorpus)
  PC 1
Predictors Estimates CI p
(Intercept) 3.71 2.35 – 5.07 <0.001
Subcorpus [Textbook Conversation] -2.14 -3.57 – -0.71 0.003
Subcorpus [Youth Fiction Ref.] -4.20 -5.56 – -2.84 <0.001
Subcorpus [Textbook Fiction] -4.27 -5.71 – -2.84 <0.001
Subcorpus [Info Teens Ref.] -6.75 -8.15 – -5.35 <0.001
Subcorpus [Textbook Informative] -5.81 -7.24 – -4.38 <0.001
Random Effects
σ2 0.63
τ00 Source 0.48
ICC 0.43
N Source 325
Observations 4980
Marginal R2 / Conditional R2 0.856 / 0.918
# Forest plot of the Subcorpus model's fixed effects (including the
# intercept), printing each coefficient value with its p-value stars.
# (Set type = "re" instead to visualise the random effects.)
plot_model(md_subcorpus,
           title = "Fixed effects",
           axis.title = "PC1 estimated coefficients",
           show.intercept = TRUE,
           show.values = TRUE,
           show.p = TRUE,
           value.offset = .4,
           value.size = 3.5,
           wrap.labels = 40,
           colors = palette[c(1:3, 7:9)],
           # One colour group per term: intercept + the five subcorpora
           group.terms = c(1, 5, 2, 6, 4, 3)) +
  theme_sjplot2()

# Estimated marginal means of PC1 for each of the six subcorpora.
# With 4,980 observations, emmeans falls back to asymptotic (z-based)
# inference instead of pbkrtest/lmerTest degrees of freedom — hence the
# two notes below and the Inf df column in the summary.
subcorpus_results <- emmeans(md_subcorpus, specs = "Subcorpus")
## Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
## To enable adjustments, add the argument 'pbkrtest.limit = 4980' (or larger)
## [or, globally, 'set emm_options(pbkrtest.limit = 4980)' or larger];
## but be warned that this may result in large computation time and memory use.
## Note: D.f. calculations have been disabled because the number of observations exceeds 3000.
## To enable adjustments, add the argument 'lmerTest.limit = 4980' (or larger)
## [or, globally, 'set emm_options(lmerTest.limit = 4980)' or larger];
## but be warned that this may result in large computation time and memory use.
summary(subcorpus_results)
##  Subcorpus             emmean     SE  df asymp.LCL asymp.UCL
##  Spoken BNC2014 Ref.    3.710 0.6920 Inf     2.354     5.067
##  Textbook Conversation  1.573 0.2334 Inf     1.116     2.031
##  Youth Fiction Ref.    -0.491 0.0461 Inf    -0.581    -0.400
##  Textbook Fiction      -0.564 0.2361 Inf    -1.026    -0.101
##  Info Teens Ref.       -3.039 0.1800 Inf    -3.392    -2.686
##  Textbook Informative  -2.099 0.2347 Inf    -2.559    -1.639
## 
## Degrees-of-freedom method: asymptotic 
## Confidence level used: 0.95
#write_last_clip()